ggplot2 allows you to:
Syntax for ggplot2 visualizations will look like:
library(ggplot2)
# Scatter plot of highway mileage (hwy) against engine displacement (displ)
# from the mpg data set, with points colored by vehicle class.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  labs(
    x = "Engine Displacement (liters)",
    y = "Highway Mileage",
    title = "Fuel Economy vs. Engine Displacement"
  ) +
  theme_bw()
Load these packages to follow along with this tutorial
# Packages used throughout the tutorial. Each call sits on its own line:
# R cannot parse several calls written back to back without a separator.
library(ggplot2) # plotting
library(readxl)  # read_excel()
library(dplyr)   # %>%, group_by(), summarise(), tally()
## ## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats': ## ## filter, lag
## The following objects are masked from 'package:base': ## ## intersect, setdiff, setequal, union
Import the following data sets from the data folder
# Read the four tutorial data sets from the local data/ folder.
supermarket <- read_excel(
  path = "data/Supermarket Transactions.xlsx",
  sheet = "Data"
)
facebook <- read.delim(file = "data/facebook.tsv")
reddit <- read.csv(file = "data/reddit.csv")
race <- read.csv(file = "data/race-comparison.csv")
# Step 1: ggplot() with only a data set; no aesthetics or layers yet.
ggplot(data = supermarket)

# Step 2: map `Purchase Date` to x and Revenue to y; still no geom layer,
# so nothing is drawn for the observations themselves.
ggplot(data = supermarket, aes(x = `Purchase Date`, y = Revenue))
## ## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr': ## ## combine
To display the data we need to tell ggplot what to draw
| Geom | Chart type |
|---|---|
| `geom_histogram()` | histogram |
| `geom_freqpoly()` | frequency polygon |
| `geom_bar()` | bar chart |
| `geom_point()` | scatter plot |
| `geom_line()` | line chart |
| `geom_boxplot()` | boxplot |
Check out all the available geoms at docs.ggplot2.org/current
# Distribution of Revenue drawn with three geoms, first with defaults ...
ggplot(supermarket, aes(Revenue)) +
  geom_histogram()

ggplot(supermarket, aes(Revenue)) +
  geom_freqpoly()

ggplot(supermarket, aes(Revenue)) +
  geom_density()

# ... then with explicit bin counts, colors, fill and transparency.
ggplot(supermarket, aes(Revenue)) +
  geom_histogram(bins = 100, color = "grey40", fill = "white")

ggplot(supermarket, aes(Revenue)) +
  geom_freqpoly(bins = 100, color = "blue")

ggplot(supermarket, aes(Revenue)) +
  geom_density(fill = "red", alpha = .5)
# Bar chart of the number of rows per product family (geom_bar counts rows).
ggplot(supermarket, aes(`Product Family`)) +
  geom_bar()

# Same chart built from a pre-tallied table: the counts live in column n,
# so the bar heights are taken as-is via stat = "identity".
summary <- supermarket %>%
  group_by(`Product Family`) %>%
  tally()

ggplot(summary, aes(`Product Family`, n)) +
  geom_bar(stat = "identity")

# Styling the bars: fill color, outline color and bar width.
ggplot(supermarket, aes(`Product Family`)) +
  geom_bar(fill = "dodgerblue", color = "grey40")

ggplot(supermarket, aes(`Product Family`)) +
  geom_bar(fill = "dodgerblue", color = "grey40", width = .75)

ggplot(supermarket, aes(`Product Family`)) +
  geom_bar(fill = "dodgerblue", color = "grey40", width = .99)
1. Assess the distribution of age, tenure, and gender in the facebook data.
2. Assess the frequency of age range, education, and income range in the reddit data.
# Revenue against purchase date as a scatter plot, then with custom point
# color, size and shape, then with transparency to ease overplotting.
ggplot(data = supermarket, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_point()

ggplot(data = supermarket, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_point(color = "blue", size = 1, shape = 5)

ggplot(data = supermarket, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_point(color = "blue", alpha = .25)

# Revenue against units sold (treated as a factor): plain points first,
# then jittered points, then jittered points with transparency.
ggplot(data = supermarket, mapping = aes(x = factor(`Units Sold`), y = Revenue)) +
  geom_point()

ggplot(data = supermarket, mapping = aes(x = factor(`Units Sold`), y = Revenue)) +
  geom_jitter(size = 1)

ggplot(data = supermarket, mapping = aes(x = factor(`Units Sold`), y = Revenue)) +
  geom_jitter(size = 1, alpha = .1)
# Total revenue per purchase date (missing revenues ignored), drawn as a
# line chart.
sales_by_date <- supermarket %>%
  group_by(`Purchase Date`) %>%
  summarise(Revenue = sum(Revenue, na.rm = TRUE))

ggplot(data = sales_by_date, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_line()
# Keep the daily-revenue line chart in an object so that different smoothers
# can be layered on top of the same base plot.
sales_plot <- ggplot(sales_by_date, aes(`Purchase Date`, Revenue)) +
  geom_line()

# Small span: the smoother follows the data closely.
sales_plot + geom_smooth(span = .1)

# Large span, confidence band switched off. The arguments must be separated
# by a comma (the original had a stray period after .9, a syntax error).
sales_plot + geom_smooth(span = .9, se = FALSE)

# Straight-line (linear model) fit, confidence band switched off.
sales_plot + geom_smooth(method = "lm", se = FALSE)
# Revenue by number of children (treated as a factor): default boxplot,
# notched and filled boxplot, and boxplot with restyled outliers.
ggplot(data = supermarket, mapping = aes(x = factor(Children), y = Revenue)) +
  geom_boxplot()

ggplot(data = supermarket, mapping = aes(x = factor(Children), y = Revenue)) +
  geom_boxplot(notch = TRUE, fill = "blue", alpha = .25)

ggplot(data = supermarket, mapping = aes(x = factor(Children), y = Revenue)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 1)
Useful for smaller data sets like mpg
# Highway mileage by vehicle class in mpg: boxplot, boxplot with the raw
# observations jittered on top, and violin plot.
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot() +
  geom_jitter(width = .2, alpha = .5)

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_violin()
Bar charts can have a y-axis different than just counts
# Default bar chart: bar height is the row count per product family.
ggplot(data = supermarket, mapping = aes(x = `Product Family`)) +
  geom_bar()

# Bar height as total revenue instead: sum Revenue per product family
# (missing values ignored) and plot those totals as-is.
prod_revenue <- supermarket %>%
  group_by(`Product Family`) %>%
  summarise(Revenue = sum(Revenue, na.rm = TRUE))

ggplot(data = prod_revenue, mapping = aes(x = `Product Family`, y = Revenue)) +
  geom_bar(stat = "identity")
Assess the relationship between tenure and age, gender, likes, etc. in the facebook data.
# Mapping a third variable to color or fill.
# One frequency polygon of Revenue per product family.
ggplot(data = supermarket, mapping = aes(x = Revenue, color = `Product Family`)) +
  geom_freqpoly()

# Counts per product family, with side-by-side bars for each gender.
ggplot(supermarket, aes(x = `Product Family`, fill = Gender)) +
  geom_bar(position = "dodge")

# Revenue over time, points colored by country.
ggplot(
  data = supermarket,
  mapping = aes(x = `Purchase Date`, y = Revenue, color = Country)
) +
  geom_point()
# Total revenue per purchase date and product family (missing revenues
# ignored). summarise() only peels off the last grouping variable, so the
# result would stay grouped by `Purchase Date`; ungroup() returns a plain
# table so later dplyr steps are not silently applied per date.
prod_revenue <- supermarket %>%
  group_by(`Purchase Date`, `Product Family`) %>%
  summarise(Revenue = sum(Revenue, na.rm = TRUE)) %>%
  ungroup()
# All product families in one panel, told apart by color: faint raw lines
# with a smoother (no confidence band) on top.
ggplot(
  data = prod_revenue,
  mapping = aes(x = `Purchase Date`, y = Revenue, color = `Product Family`)
) +
  geom_line(alpha = .2) +
  geom_smooth(se = FALSE, span = .1)

# Same layers, but one wrapped panel per product family instead of colors.
ggplot(data = prod_revenue, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_line(alpha = .2) +
  geom_smooth(se = FALSE, span = .1) +
  facet_wrap(~ `Product Family`)

# Facet layout only (geom_blank draws nothing): families as columns ...
ggplot(data = prod_revenue, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_blank() +
  facet_grid(. ~ `Product Family`)

# ... and families as rows.
ggplot(data = prod_revenue, mapping = aes(x = `Purchase Date`, y = Revenue)) +
  geom_blank() +
  facet_grid(`Product Family` ~ .)